{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\n",
    "import os\n",
    "import jieba\n",
    "import re\n",
    "import math\n",
    "import time\n",
    "from numpy import linalg as la\n",
    "import logging\n",
    "from collections import defaultdict\n",
    "from functools import wraps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "say_words = ['诊断', '交代', '说', '说道', '指出','报道','报道说','称', '警告',\n",
    "           '所说', '告诉', '声称', '表示', '时说', '地说', '却说', '问道', '写道', \n",
    "           '答道', '感叹', '谈到', '说出', '认为', '提到', '强调', '宣称', '表明', \n",
    "           '明确指出', '所言', '所述', '所称', '所指', '常说', '断言', '名言', '告知', \n",
    "           '询问', '知道', '得知', '质问', '问', '告诫', '坚称', '辩称', '否认', '还称', \n",
    "           '指责', '透露', '坦言', '表达', '中说', '中称', '他称', '地问', '地称', '地用',\n",
    "           '地指', '脱口而出', '一脸', '直说', '说好', '反问', '责怪', '放过', '慨叹', '问起',\n",
    "           '喊道', '写到', '如是说', '何况', '答', '叹道', '岂能', '感慨', '叹', '赞叹', '叹息',\n",
    "           '自叹', '自言', '谈及', '谈起', '谈论', '特别强调', '提及', '坦白', '相信', '看来', \n",
    "           '觉得', '并不认为', '确信', '提过', '引用', '详细描述', '详述', '重申', '阐述', '阐释',\n",
    "           '承认', '说明', '证实', '揭示', '自述', '直言', '深信', '断定', '获知', '知悉', '得悉', \n",
    "           '透漏', '追问', '明白', '知晓', '发觉', '察觉到', '察觉', '怒斥', '斥责', '痛斥', '指摘',\n",
    "           '回答', '请问', '坚信', '一再强调', '矢口否认', '反指', '坦承', '指证', '供称', '驳斥', \n",
    "           '反驳', '指控', '澄清', '谴责', '批评', '抨击', '严厉批评', '诋毁', '责难', '忍不住', \n",
    "           '大骂', '痛骂', '问及', '阐明']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyltp import Segmentor\n",
    "from pyltp import Postagger\n",
    "from pyltp import Parser\n",
    "from pyltp import NamedEntityRecognizer\n",
    "from pyltp import SentenceSplitter\n",
    "\n",
    "\n",
    "cws_model_path = 'H:\\\\JupyterCode\\\\NLP\\\\ltp_data_v3.4.0\\\\cws.model'\n",
    "pos_model_path = 'H:\\\\JupyterCode\\\\NLP\\\\ltp_data_v3.4.0\\\\pos.model'\n",
    "par_model_path = 'H:\\\\JupyterCode\\\\NLP\\\\ltp_data_v3.4.0\\\\parser.model'\n",
    "ner_model_path = 'H:\\\\JupyterCode\\\\NLP\\\\ltp_data_v3.4.0\\\\ner.model'\n",
    "\n",
    "#初始化\n",
    "segmentor = Segmentor()#分词\n",
    "postagger = Postagger()#词性标注\n",
    "recognizer = NamedEntityRecognizer()#命名主体识别\n",
    "parser = Parser()#依存分析\n",
    "segmentor.load(cws_model_path)\n",
    "postagger.load(pos_model_path)  \n",
    "recognizer.load(ner_model_path)  \n",
    "parser.load(par_model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sen 今天很热，小王说明天不想去上班了。\n",
      "['今天', '很', '热', '，', '小王', '说', '明天', '不', '想', '去', '上班', '了', '。']\n",
      "['O', 'O', 'O', 'O', 'S-Nh', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n",
      "3:ADV 3:ADV 0:HED 3:WP 6:SBV 3:COO 9:ADV 9:ADV 6:VOB 11:ADV 9:VOB 11:RAD 3:WP\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "('小王', '明天不想去上班了。')"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def parse_sentence(sentence,  ws=False):\n",
    "        #sentence = ' '.join([x for x in sentence.split('，') if x])\n",
    "        print(\"sen\", sentence)\n",
    "        cuts = list(segmentor.segment(sentence))  # pyltp分词\n",
    "        # 判断是否有‘说’相关词：\n",
    "        #print(cuts)\n",
    "        mixed = [word for word in cuts if word in say_words]\n",
    "        #print(\"mixed  \",mixed)\n",
    "        if not mixed : return False\n",
    "        ne = get_name_entity(sentence) #命名实体\n",
    "        wp = parsing(sentence) #依存分析\n",
    "        wp_relation = [w.relation for w in wp]\n",
    "        postags = list(postagger.postag(cuts))\n",
    "        name = ''\n",
    "        stack = [] \n",
    "        for k, v in enumerate(wp):\n",
    "            # save the most recent Noun\n",
    "            if postags[k] in ['nh', 'ni', 'ns']:\n",
    "                stack.append(cuts[k])\n",
    "            if v.relation=='SBV' and (cuts[v.head-1] in mixed) : #确定第一个主谓句\n",
    "                name = get_name(cuts[k], cuts[v.head-1], cuts, wp_relation,ne)\n",
    "                saying = get_saying(cuts, wp_relation, [i.head for i in wp], v.head)\n",
    "                if not saying:\n",
    "                    quotations = re.findall(r'“(.+?)”', sentence)\n",
    "                    if quotations: says = quotations[-1]\n",
    "                return name, saying\n",
    "            # 若找到‘：’后面必定为言论。\n",
    "            if cuts[k] == '：': \n",
    "                name = stack.pop()\n",
    "                saying = ''.join(cuts[k+1:])\n",
    "                return name, saying\n",
    "        return False\n",
    "    \n",
    "parse_sentence(\"今天很热，小王说明天不想去上班了。\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#命名实体识别\n",
    "def get_name_entity(sentence):\n",
    "    #sentence = ''.join(strs)\n",
    "    words = segmentor.segment(sentence)\n",
    "    postags = postagger.postag(words) #词性标注\n",
    "    netags = recognizer.recognize(words, postags) #命名实体识别\n",
    "    print(list(netags))\n",
    "    return netags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 句子依存分析\n",
    "def parsing(sentence):\n",
    "    words = segmentor.segment(sentence)  # pyltp分词\n",
    "    postags = postagger.postag(words)  # 词性标注\n",
    "    arcs = parser.parse(words, postags)  # 句法分析\n",
    "    print(' '.join(\"%d:%s\" % (arc.head, arc.relation) for arc in arcs))\n",
    "    return arcs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 输入主语第一个词语、谓语、词语数组、词性数组，查找完整主语\n",
    "def get_name( name, predic, words, property, ne):\n",
    "    index = words.index(name)\n",
    "    cut_property = property[index + 1:] #截取到name后第一个词语\n",
    "    pre=words[:index]#前半部分\n",
    "    pos=words[index+1:]#后半部分\n",
    "    #向前拼接主语的定语\n",
    "    while pre:\n",
    "        w = pre.pop(-1)\n",
    "        w_index = words.index(w)\n",
    "\n",
    "        if property[w_index] == 'ADV': continue\n",
    "        if property[w_index] in ['WP', 'ATT', 'SVB'] and (w not in ['，','。','、','）','（']):\n",
    "            name = w + name\n",
    "        else:\n",
    "            pre = False\n",
    "\n",
    "    while pos:\n",
    "        w = pos.pop(0)\n",
    "        p = cut_property.pop(0)\n",
    "        if p in ['WP', 'LAD', 'COO', 'RAD'] and w != predic and (w not in ['，', '。', '、', '）', '（']):\n",
    "            name = name + w # 向后拼接\n",
    "        else: #中断拼接直接返回\n",
    "            return name\n",
    "    return name\n",
    "\n",
    "# 获取谓语之后的言论\n",
    "def get_saying(sentence, proper, heads, pos):\n",
    "    # word = sentence.pop(0) #谓语\n",
    "    if '：' in sentence:\n",
    "        return ''.join(sentence[sentence.index('：')+1:])\n",
    "    while pos < len(sentence):\n",
    "        w = sentence[pos]\n",
    "        p = proper[pos]\n",
    "        h = heads[pos]\n",
    "        # 谓语尚未结束\n",
    "        if p in ['DBL', 'CMP', 'RAD']:\n",
    "            pos += 1\n",
    "            continue\n",
    "        # 定语\n",
    "        if p == 'ATT' and proper[h-1] != 'SBV':\n",
    "            pos = h\n",
    "            continue\n",
    "        # 宾语\n",
    "        if p == 'VOB':\n",
    "            pos += 1\n",
    "            continue\n",
    "        # if p in ['ATT', 'VOB', 'DBL', 'CMP']:  # 遇到此性质代表谓语未结束，continue\n",
    "        #    continue\n",
    "        else:\n",
    "            if w == '，':\n",
    "                return ''.join(sentence[pos+1:])\n",
    "            else:\n",
    "                return ''.join(sentence[pos:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6\n"
     ]
    }
   ],
   "source": [
    "a = [1,2,3]\n",
    "b = sum(a)\n",
    "print(b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['小王', '说', '因为', '天气', '太', '热', '不', '想', '去', '上班', '了', '。', '他', '宣称', '这么', '热', '的', '天', '去', '上班', '可能', '会', '中', '暑', '。']\n"
     ]
    }
   ],
   "source": [
    "corpus_1 = ['小王说因为天气太热不想去上班了。他宣称这么热的天去上班可能会中暑。']\n",
    "k = segmentor.segment(corpus_1[0])\n",
    "print(list(k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gensim\n",
    "import numpy as np\n",
    "#对于连续的句子。\n",
    "#用词向量的均值代表句子向量 \n",
    "#假设有这么两句话，\n",
    "a = '小王说因为天气太热不想去上班了。他这么热的天去上班可能会中暑。'\n",
    "b = '小王说因为天气太热不想去上班了。小李向老板举报了他'\n",
    "\n",
    "model = gensim.models.Word2Vec(wiki)\n",
    "sen_1 = '小王说因为天气太热不想去上班了。'\n",
    "words_1 = list(segmentor.segment(sen_1))\n",
    "sen_1_vec = sum([model[k] for k in words_1]) / len(words_1)\n",
    "                   \n",
    "sen_2 = '这么热的天去上班可能会中暑。'\n",
    "words_2 = list(segmentor.segment(sen_2))\n",
    "sen_2_vec = sum([model[k] for k in words_2]) / len(words_2)\n",
    "\n",
    "sen_3 = '自己更愿意去打游戏。'\n",
    "words_3 = list(segmentor.segment(sen_3))\n",
    "sen_3_vec = sum([model[k] for k in words_3]) / len(words_3)\n",
    "\n",
    "\n",
    "def sen_vec(sentence):\n",
    "    words = list(segmentor.segment(sen))\n",
    "    sen_vec = np.sum([model[k] for k in words],axis = 1) / len(words)\n",
    "    return sen_vec\n",
    "                   \n",
    "def cosine_dis(vec1, vec2):\n",
    "    return np.dot(vector1,vector2)/(np.linalg.norm(vector1)*(np.linalg.norm(vector2)))\n",
    "\n",
    "cosine_dis(sen_1_vec, sen_2_vec)\n",
    "cosine_dis(sen_1_vec, sen_3_vec)\n",
    "                   \n",
    "expect = 0.7    \n",
    "if cosine_dis(sen_vec(sen_1), sen_vec(sen_2)) > expect:\n",
    "    sen = sen1+sen2\n",
    "     \n",
    "              "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# updating\n",
    "# 去掉停用词\n",
    "# 用tfidf值作为权重。\n",
    "\n",
    "sen_1_vec = sum([model[k] * tfidf[k] for k in words_1 if k not in stop_words]) / len()   "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
